Aminet 1 (Walnut Creek)

home *** CD-ROM | disk | FTP | other *** search

/ Aminet 1 (Walnut Creek) / Aminet - June 1993 [Walnut Creek].iso / usenet / sources / volume91 / libraris / sregexp1 / part02 / sregexp.c < prev next >

Wrap

C/C++ Source or Header | 1992-04-20 | 17KB | 663 lines

/* This is a routine to test if a string matches a regular expression. * * * The regular expression sytax is a slight extension of the regular * AmigaDos wildcard patterns. Apparently 2.0 has added a few new things * (well, at least a not operator) to the game. This may or may not * be consistant with those additions (it's the way they otta done it.) * * Here it is: * * ? matches any one character * * #(pat) matches zero or more occurances of the pattern pat. * * (pat) seperates out a piece of the pattern. * * pat1|pat2 matches to either pat1 or pat2 * * ' escapes the special meaning of these symbols. * * % matches the null string. * * EXTENSIONS * * [chars] will match any single character in the set chars, * specified as single characters or ranges seperated * by a -. Ex. [af-i+] would match a, f, g, h, i, and * +. If ~ is the first character in the set then the * set matched is the complement of the set specified. * Ex. [~a-z] would match any one character that is * not a (lower case if case sensitivity is on) letter. * Note that a [~a] is NOT the same as a ~[a]. The former * would match a 'b' but not a 'ab', whereas the latter * would match both. * * ~(pat) will match anything that doesn't match the pattern. * Note: it is illegal to repeat a not. ie #~a is illegal. * (So is #(~(a)) so don't even try it.) You can repeat * something with a not IN it, as long as it can't be * reduced to the form #~(pattern). (A #[~a] is OK.) * However ~#a has the expected effect (matches any * non-null string not composed entirely of a's.) * * * same as #?. * * */ #include "sregexp.h" const char wilds[] = { /* string of the wildcards, for easy access */ CHR_REPEAT, CHR_NOT, CHR_OPENBRACE, CHR_CLOSEBRACE, CHR_OPENSET, CHR_CLOSESET, CHR_ANYCHAR, CHR_NULL, CHR_OR, CHR_ESCAPE, /* CHR_RANGE, <--- special case */ CHR_STAR, 0 }; #ifdef __DEBUG__ extern void puts(char *); extern void printf(char *, ...); #endif struct SregExp * parsesregexp(expr) char *expr; /* This is the entry point into the parsing subroutines. */ { return parsesub(&expr,0); } static struct SregExp * parsesub(p,end) char **p,end; /* This routine will parse a sub expression up until the character 'end' is encontered. This is 0 for the whole pattern and ')' for a subpattern */ { struct SregList *list,*head=NULL,*orlist,*orhead=NULL; struct SregExp *sreg; int num = 0,ornum = 0; while (**p != end) { if (**p == CHR_OR) { /* deal with stuff like (ab||cd) == (ab|%|cd) */ if (!(sreg = makenull())) goto cleanup; } else { if (!(sreg = parseone(p,TRUE))) /* parse one wildcard */ goto cleanup; } if (head) { /* if there's already a list, just stick it on */ if (!(list = (list->srl_next = getmem(sizeof(struct SregList))))) { report(MEM_ERROR); goto cleanup; } } else { /* otherwise, make a new list */ if (!(list = (head = getmem(sizeof(struct SregList))))) { report(MEM_ERROR); goto cleanup; } } list->srl_sreg = sreg; list->srl_next = NULL; num++; if (**p == CHR_OR) { /* deal with an or */ if (!(sreg = makesum(head,num))) goto cleanup; if (orhead) { /* either add onto the or list */ if (!(orlist = (orlist->srl_next = getmem(sizeof(struct SregList))))) { report(MEM_ERROR); goto cleanup; } } else { /* or make a new or list */ if (!(orlist = (orhead = getmem(sizeof(struct SregList))))) { report(MEM_ERROR); goto cleanup; } } orlist->srl_sreg = sreg; orlist->srl_next = NULL; ornum++; (*p)++; head = NULL; num = 0; } } if (head) { if (!(sreg = makesum(head,num))) goto cleanup; } else { if (!(sreg = makenull())) goto cleanup; } head = NULL; if (orhead) { /* if this expression had an or, clean it up */ if (!(orlist = (orlist->srl_next = getmem(sizeof(struct SregList))))) { report(MEM_ERROR); freesregexp(sreg); goto cleanup; } orlist->srl_sreg = sreg; orlist->srl_next = NULL; ornum++; if (!(sreg = makeor(orhead,ornum))) goto cleanup; } return sreg; /* sub expression successfully matched. */ cleanup: /* troubles all head here to clean up */ list = head; while (list) { /* free all the bits of the current sum */ head = list->srl_next; freesregexp(list->srl_sreg); freemem(list,sizeof(struct SregList)); list = head; } list = orhead; while (list) { /* and all of the current or parts */ head = list->srl_next; freesregexp(list->srl_sreg); freemem(list,sizeof(struct SregList)); list = head; } return NULL; /* return the failure */ } static struct SregExp * makesum(list,num) struct SregList *list; int num; /* This routine takes a linked list of sreg's and joins them together Under the auspices of one big SRP_SUM type sreg */ { struct SregList *lnk; struct SregExp *sreg; int i,len=0; char fixed = SRF_FIXLEN; if (num == 0) { report(ILLEGAL_ERR); return NULL; } else if (num == 1) { sreg = list->srl_sreg; freemem(list,sizeof(struct SregList)); return sreg; } if (!(sreg = getmem(sizeof(struct SregExp) + num*sizeof(struct SregExp *)))) { report(MEM_ERROR); return NULL; } sreg->sre_Type = SRP_SUM; sreg->sre_Flag = 0; sreg->sre_Data.number = num; for (i = 0; i < num; i++) { len += realen(list->srl_sreg); fixed &= isfixed(list->srl_sreg) ? SRF_FIXLEN : 0; sreg->sre_List[i] = list->srl_sreg; lnk = list->srl_next; freemem(list,sizeof(struct SregList)); list = lnk; } sreg->sre_MinLen = len; sreg->sre_Flag |= fixed; return sreg; } static struct SregExp * makeor(list,num) struct SregList *list; int num; /* This does basically the same thing as makesum, except it makes an SRP_OR type sreg to join the list and doesn't deal with some special cases */ { struct SregList *lnk; struct SregExp *sreg; int i,len; char fixed = SRF_FIXLEN; if (!(sreg = getmem(sizeof(struct SregExp) + num*sizeof(struct SregExp *)))) { report(MEM_ERROR); return NULL; } sreg->sre_Type = SRP_OR; sreg->sre_Flag = 0; sreg->sre_Data.number = num; for (i = 0; i < num; i++) { if (i == 0) len = realen(list->srl_sreg); else if (len != realen(list->srl_sreg)) { fixed = 0; if (len > realen(list->srl_sreg)) len = realen(list->srl_sreg); } fixed &= isfixed(list->srl_sreg) ? SRF_FIXLEN : 0; sreg->sre_List[i] = list->srl_sreg; lnk = list->srl_next; freemem(list,sizeof(struct SregList)); list = lnk; } sreg->sre_MinLen = len; sreg->sre_Flag |= fixed; return sreg; } static struct SregExp * parseone(p,cat) char **p,cat; /* This routine parses one wildcard character from the string *p, leaving it set up pointing to the begining of the next wildcard. If 'cat' is true then a string of characters will be grouped together as a SRP_STRING type sreg. It may be false because of the scope rules of ~ and #, which only repeat the very next pattern. */ { struct SregExp *sreg; switch (**p) { case (CHR_REPEAT) : (*p)++; if (!(sreg = parseone(p,FALSE))) return NULL; if (sreg->sre_Flag & SRF_NOT) { report(ILLEGAL_ERR); freesregexp(sreg); return NULL; } sreg->sre_Flag |= SRF_REPEAT; break; case (CHR_NOT) : (*p)++; if (!(sreg = parseone(p,FALSE))) return NULL; sreg->sre_Flag |= SRF_NOT; break; case (CHR_OPENBRACE) : (*p)++; sreg = parsesub(p,CHR_CLOSEBRACE); (*p)++; break; case (CHR_OPENSET) : (*p)++; if (!(sreg = getmem(sizeof(struct SregExp)))) { report(MEM_ERROR); return NULL; } sreg->sre_Type = SRP_SETCHAR; sreg->sre_Flag = SRF_FIXLEN; sreg->sre_MinLen = 1; if (!(sreg->sre_Data.setchar = makeset(p))) { freemem(sreg,sizeof(sreg)); return NULL; } (*p)++; break; case (CHR_ANYCHAR) : (*p)++; if (!(sreg = getmem(sizeof(struct SregExp)))) { report(MEM_ERROR); return NULL; } sreg->sre_Type = SRP_ANYCHAR; sreg->sre_Flag = SRF_FIXLEN; sreg->sre_MinLen = 1; break; case (CHR_STAR) : (*p)++; if (!(sreg = getmem(sizeof(struct SregExp)))) { report(MEM_ERROR); return NULL; } sreg->sre_Type = SRP_ANYCHAR; sreg->sre_Flag = SRF_FIXLEN | SRF_REPEAT; sreg->sre_MinLen = 1; break; case (CHR_NULL) : (*p)++; if (!(sreg = makenull())) return NULL; break; case (CHR_CLOSEBRACE) : case (CHR_CLOSESET) : case (CHR_OR) : case (0) : report(ILLEGAL_ERR); return NULL; default : if (!(sreg = getmem(sizeof(struct SregExp)))) { report(MEM_ERROR); return NULL; } sreg->sre_Flag = SRF_FIXLEN; if (!cat) { sreg->sre_Data.onechar = onechar(p,FALSE); sreg->sre_Type = SRP_ONECHAR; sreg->sre_MinLen = 1; } else { char *q=*p; int len=0; while (onechar(&q,FALSE)) len++; sreg->sre_MinLen = len; if (len == 1) { sreg->sre_Data.onechar = onechar(p,FALSE); sreg->sre_Type = SRP_ONECHAR; } else { sreg->sre_Type = SRP_STRING; if (!(q = sreg->sre_Data.string = getmem(len+1))) { report(MEM_ERROR); freemem(sreg,sizeof(sreg)); return NULL; } while (*q++ = onechar(p,FALSE)) ; } } } return sreg; } static struct SregExp * makenull() /* This routine makes an node matching the null string. */ { struct SregExp *sreg; if (!(sreg = getmem(sizeof(struct SregExp)))) { report(MEM_ERROR); return NULL; } sreg->sre_Type = SRP_NULL; sreg->sre_Flag = SRF_FIXLEN; sreg->sre_MinLen = 0; } static char onechar(p,minus) char **p,minus; /* this routine picks one character off the string *p. It handles escaping the special meaning of the wildcard characters, and returns 0 if we're at the end of the string or the next char is a wildcard. if 'minus' is true, then the '-' is treated as a wildcard, otherwise it isn't */ { if (**p == CHR_ESCAPE) (*p)++; else if (strchr(wilds,**p) || (minus && **p == CHR_RANGE)) return 0; return *(*p)++; } static char * makeset(p) char **p; /* this makes a table of match flags from the character specifier that goes in between [ and ]. It handles a leading '~' and leaves *p pointing to the closing brace. Note: This routine checks to make sure the specifier ends in a ] and fails if it doesn't */ { char *set,not=FALSE,ok=FALSE,s,e; if (**p == CHR_NOT) { (*p)++; not = TRUE; } if (!(set = getmem(32))) { report(MEM_ERROR); return NULL; } for (s = 0; s < 32; s++) set[s] = 0; while (s = onechar(p,TRUE)) { ok = TRUE; if (**p == CHR_RANGE) { (*p)++; if (!(e = onechar(p,TRUE))) { report(ILLEGAL_ERR); freemem(set,32); return NULL; } for ( ; s <= e; s++) set[s/8] |= 1 << (s % 8); } else set[s/8] |= 1 << (s % 8); } if (!ok || **p != CHR_CLOSESET) { report(ILLEGAL_ERR); freemem(set,32); return NULL; } if (not) { for(s = 0; s < 32; s++) set[s] = ~set[s]; } return set; } void freesregexp(sreg) struct SregExp *sreg; /* this routine frees up a sreg. It correctly handles freeing the subpattern elements in a SRP_SUM or SRP_OR sreg and frees the match table or string from a SRP_SETCHAR or SRP_STRING sreg. This is one of the externally visible routines. */ { int add = 0; if (sreg->sre_Type == SRP_SUM || sreg->sre_Type == SRP_OR) { int i; add = sreg->sre_Data.number * sizeof(struct SregExp *); for (i = 0; i < sreg->sre_Data.number; i++) freesregexp(sreg->sre_List[i]); } else if (sreg->sre_Type == SRP_STRING) freemem(sreg->sre_Data.string,strlen(sreg->sre_Data.string)+1); else if (sreg->sre_Type == SRP_SETCHAR) freemem(sreg->sre_Data.setchar,32); freemem(sreg,sizeof(struct SregExp) + add); } int matchsregexp(p,sreg,up) char *p; struct SregExp *sreg; int up; /* This is the externally visible stub to the pattern matching part of sreg. 'p' is the string to match to, sreg is the preparsed sreg, and up indicates if the match is case sensitive. */ { return matchnsregexp(p,sreg,up,strlen(p)); } int matchnsregexp(p,sreg,up,len) char *p; struct SregExp *sreg; int up,len; /* This routine will match a sreg to a string of length 'len'. */ /* The length, rather than NULL terminated is used to make a lot of things a lot simpler */ { int res,i; if (sreg->sre_Flag & SRF_REPEAT) { /* deal with repeaters. */ char not,*h; if (len == 0 || sreg->sre_Type == SRP_ANYCHAR) { /* always true */ res = TRUE; goto end; } /* check if the lengths match up */ if (len < sreg->sre_MinLen || (sreg->sre_Flag & SRF_FIXLEN && (sreg->sre_MinLen == 0 || len % sreg->sre_MinLen != 0))) { res = FALSE; goto end; } not = sreg->sre_Flag & SRF_NOT; sreg->sre_Flag &= ~(SRF_REPEAT | SRF_NOT); if (sreg->sre_Flag & SRF_FIXLEN) { /* handle fixed lenght matches */ h = p; while (h-p < len) { if (!(res = matchnsregexp(h,sreg,up,sreg->sre_MinLen))) goto end; h += sreg->sre_MinLen; } } else { /* and variable length ones */ for (i = len; i >= (sreg->sre_MinLen ? sreg->sre_MinLen : 1); i--) { if (res = matchnsregexp(p,sreg,up,i)) { sreg->sre_Flag |= SRF_REPEAT; if (res = matchnsregexp(p+i,sreg,up,len-i)) break; sreg->sre_Flag &= ~SRF_REPEAT; } } } sreg->sre_Flag |= SRF_REPEAT | not; goto end; } /* check the lengths first for quick rejection */ if (len < sreg->sre_MinLen || (sreg->sre_Flag & SRF_FIXLEN && len != sreg->sre_MinLen)) { res = FALSE; goto end; } switch (sreg->sre_Type) { case (SRP_SUM) : res = matchsum(&(sreg->sre_List[0]),sreg->sre_Data.number,p,len,up); break; case (SRP_ONECHAR) : res = len == 1 && (up ? *p == sreg->sre_Data.onechar : toupper(*p) == toupper(sreg->sre_Data.onechar)); break; case (SRP_SETCHAR) : res = len == 1 && matchset(sreg,*p) || (up ? FALSE : (isupper(*p) ? matchset(sreg,tolower(*p)) : matchset(sreg,toupper(*p)))); break; case (SRP_ANYCHAR) : res = len == 1; break; case (SRP_STRING) : if (up) res = !strncmp(p,sreg->sre_Data.string,len); else res = !strnicmp(p,sreg->sre_Data.string,len); break; case (SRP_NULL) : res = len == 0; break; case (SRP_OR) : for (i = 0; i < sreg->sre_Data.number; i++) if (res = matchnsregexp(p,sreg->sre_List[i],up,len)) { break; } break; } end: if (sreg->sre_Flag & SRF_NOT) return !res; else return res; } static int matchsum(list,num,p,len,up) struct SregExp *list[]; int num,len,up; char *p; /* This routine is called by match to match the elements of a sum of sregs. It tries to be as effecient as possible, and keep recursion to a minimum. It will match any fixed length peices at the begining and end first, then match any fixed length peices in the middle to break the string up. Only then does it do the tiresome matches to the non-fixed length peices. */ { int i,res,o; while (num && isfixed(list[0])) { if (!(res = matchnsregexp(p,list[0],up,list[0]->sre_MinLen))) return FALSE; p += list[0]->sre_MinLen; len -= list[0]->sre_MinLen; list++; num--; } while (num && isfixed(list[num-1])) { if (!(res = matchnsregexp(p+len-list[num-1]->sre_MinLen,list[num-1],up, list[num-1]->sre_MinLen))) return FALSE; len -= list[num-1]->sre_MinLen; num--; } if (num == 0) return TRUE; o = 0; for (i = 1; i < num-1; i++) { if (isfixed(list[i])) { for ( ; len-o >= list[i]->sre_MinLen; o++) { if (res = matchnsregexp(p+o,list[i],up,list[i]->sre_MinLen)) { if (!(res = matchsum(list,i,p,o,up))) return FALSE; if (!(res = matchsum(list+(i+1),num-i-1, p+o+list[i]->sre_MinLen,len-o-list[i]->sre_MinLen,up))) return FALSE; return TRUE; } } return FALSE; } else o += realen(list[i]); } if (num == 1) return matchnsregexp(p,list[0],up,len); for (o = len; o >= list[0]->sre_MinLen; o--) if (matchnsregexp(p,list[0],up,o) && matchsum(list+1,num-1,p+o,len-o,up)) return TRUE; return FALSE; } int iswild(p) char *p; /* this is a very simple routine, but it is externally visible. Returns true if the string has wildcard characters (unescaped) false if not */ { while (onechar(&p,FALSE)) ; return *p != 0; } void report(i) int i; /* This routine now does set the IoErr value to more information. For now it does nothing. */ { struct Process *myself; if (myself = (struct Process *)FindTask(0)) myself->pr_Result2 = i; }